{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 40,
   "metadata": {
    "collapsed": true,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import matplotlib.pyplot as plt\n",
    "import jieba\n",
    "import jieba.analyse\n",
    "from sklearn.feature_extraction.text import CountVectorizer, HashingVectorizer, TfidfTransformer,TfidfVectorizer\n",
    "from sklearn.linear_model import Ridge\n",
    "from sklearn.metrics import mean_squared_error\n",
    "from sklearn.preprocessing import OneHotEncoder\n",
    "import scipy\n",
    "from sklearn.model_selection import KFold,StratifiedKFold\n",
    "from scipy.sparse import csr_matrix, hstack\n",
    "import re\n",
    "import numpy as np"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "stop_word = []\n",
    "stop_words_path = '../input/stop_word.txt'\n",
    "with open(stop_words_path,encoding='utf-8') as f:\n",
    "    for line in f.readlines():\n",
    "        stop_word.append(line.strip())\n",
    "stop_word.append(' ')\n",
    "\n",
    "def clean_str(stri):\n",
    "    stri = re.sub(r'[a-zA-Z0-9]+','',stri)\n",
    "    cut_str = jieba.cut(stri.strip())\n",
    "    list_str = [word for word in cut_str if word not in stop_word]\n",
    "    return ' '.join(list_str)\n",
    "\n",
    "def rmsel(true_label,pred):\n",
    "    rmse = np.sqrt(mean_squared_error(true_label, pred))\n",
    "    return 1 / (1 + rmse)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def get_data():\n",
    "    train = pd.read_csv('../input/train_first.csv')\n",
    "    test = pd.read_csv('../input/predict_first.csv')\n",
    "    data = pd.concat([train, test])\n",
    "    print('train %s test %s'%(train.shape,test.shape))\n",
    "    print('train columns',train.columns)\n",
    "    return data,train.shape[0],train['Score'],test['Id']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def split_discuss(data):\n",
    "    data['length'] = data['Discuss'].apply(lambda x:len(x))\n",
    "    data['Discuss'] = data['Discuss'].apply(lambda x:clean_str(x))\n",
    "    return data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "def pre_process():\n",
    "    data,nrw_train,y,test_id = get_data()\n",
    "    data = split_discuss(data)\n",
    "    cv = CountVectorizer(ngram_range=(1,2))\n",
    "    discuss = cv.fit_transform(data['Discuss'])\n",
    "    tf = TfidfVectorizer(max_df=10000,ngram_range=(1,2))\n",
    "    discuss_tf = tf.fit_transform(data['Discuss'])\n",
    "    data = hstack((discuss,discuss_tf)).tocsr()\n",
    "    return data[:nrw_train],data[nrw_train:],y,test_id"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "train (100000, 3) test (30000, 2)\n",
      "train columns Index(['Id', 'Discuss', 'Score'], dtype='object')\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "c:\\users\\administrator\\appdata\\local\\programs\\python\\python35\\lib\\site-packages\\sklearn\\feature_extraction\\text.py:1059: FutureWarning: Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. In future, it will be treated as `np.float64 == np.dtype(float).type`.\n",
      "  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):\n"
     ]
    }
   ],
   "source": [
    "X,test,y,test_id = pre_process()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(100000, 1971262)"
      ]
     },
     "execution_count": 36,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "X.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 50,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "def training(X, y, T):\n",
    "    nfolds = 5\n",
    "    folds = list(StratifiedKFold(n_splits=nfolds, random_state=2018, shuffle=True).split(X, y))\n",
    "    error = []\n",
    "\n",
    "    S_train = np.zeros((X.shape[0], 1))  # 训练样本数 * 模型个数\n",
    "    S_test = np.zeros((T.shape[0], 1))  # 测试集样本数 * 模型个数\n",
    "    S_test_n = np.zeros((T.shape[0], len(folds)))  # 测试集样本数 * n_folds\n",
    "\n",
    "    model = Ridge(solver='auto', fit_intercept=True, alpha=0.4, max_iter=250, normalize=False, tol=0.01)\n",
    "    for j, (train_fold, test_fold) in enumerate(folds):\n",
    "        X_train, X_validate, label_train, label_validate = X[train_fold, :], X[test_fold, :], y[train_fold], y[test_fold]\n",
    "        model.fit(X_train, label_train)\n",
    "\n",
    "        val_ = model.predict(X=X_validate)\n",
    "        pred_ = model.predict(X=T)\n",
    "        rmse_ = rmsel(val_, label_validate)\n",
    "        print(rmse_)\n",
    "        error.append(rmse_)\n",
    "\n",
    "        S_train[test_fold] = np.array(val_).reshape(-1, 1)\n",
    "        S_test_n[:, j] = np.array(pred_)\n",
    "\n",
    "    S_test[:] = S_test_n.mean(1).reshape(-1, 1)\n",
    "    return S_train, S_test, round(np.mean(error), 5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0.5776183727627812\n",
      "0.5816785658687482\n",
      "0.5776895445831461\n"
     ]
    }
   ],
   "source": [
    "S_train, S_test, error = training(X, y, test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "train_df = pd.read_csv('../input/train_first.csv')\n",
    "test_df = pd.read_csv('../input/predict_first.csv')\n",
    "\n",
    "train_out = train_df[['Id']]\n",
    "train_out['ridge_doufu'] = S_train\n",
    "train_out.to_csv('../models/__models__/train_ridge_doufu.csv', index = False)\n",
    "\n",
    "test_out = predict_df[['Id']]\n",
    "test_out['ridge_doufu'] = S_test\n",
    "\n",
    "test_out.to_csv('../models/__models__/test_lstm_len_10.csv', index = False)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
